Assignment 1

Author

Erin Cagle

Load Packages

library(data.table)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:data.table':

    between, first, last
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(cowplot)
.<- list

Data Exploration

Load data from EPA Website for 2002 and 2022

# Daily PM2.5 exports for 2002 (`two`) and 2022 (`ttwo`), read directly from
# the EPA download broker as comma-separated text with a header row.
# NOTE(review): both URLs embed a `_sessionid` value, so they presumably stop
# working once that session ends -- confirm, and consider reading a saved copy.
two_url <- "https://www3.epa.gov/cgi-bin/broker?_service=data&_server=134.67.99.91&_port=4089&_sessionid=Gkf0VeQyR52&_PROGRAM=dataprog.ad_viz_plotval_getdata.sas"
ttwo_url <- "https://www3.epa.gov/cgi-bin/broker?_service=data&_server=134.67.99.91&_port=4079&_sessionid=JUXhIeQyR52&_PROGRAM=dataprog.ad_viz_plotval_getdata.sas"

two <- read.table(file = two_url, sep = ",", header = TRUE)
ttwo <- read.table(file = ttwo_url, sep = ",", header = TRUE)

Explore data tables for each year

dim(two)
[1] 15976    20
dim(ttwo)
[1] 56140    20
head(two)
        Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    UNITS
1 01/05/2002    AQS 60010007   1                           25.1 ug/m3 LC
2 01/06/2002    AQS 60010007   1                           31.6 ug/m3 LC
3 01/08/2002    AQS 60010007   1                           21.4 ug/m3 LC
4 01/11/2002    AQS 60010007   1                           25.9 ug/m3 LC
5 01/14/2002    AQS 60010007   1                           34.5 ug/m3 LC
6 01/17/2002    AQS 60010007   1                           41.0 ug/m3 LC
  DAILY_AQI_VALUE Site.Name DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
1              78 Livermore               1              100              88101
2              92 Livermore               1              100              88101
3              71 Livermore               1              100              88101
4              80 Livermore               1              100              88101
5              98 Livermore               1              100              88101
6             115 Livermore               1              100              88101
        AQS_PARAMETER_DESC CBSA_CODE                         CBSA_NAME
1 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
2 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
3 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
4 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
5 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
6 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
  STATE_CODE      STATE COUNTY_CODE  COUNTY SITE_LATITUDE SITE_LONGITUDE
1          6 California           1 Alameda      37.68753      -121.7842
2          6 California           1 Alameda      37.68753      -121.7842
3          6 California           1 Alameda      37.68753      -121.7842
4          6 California           1 Alameda      37.68753      -121.7842
5          6 California           1 Alameda      37.68753      -121.7842
6          6 California           1 Alameda      37.68753      -121.7842
head(ttwo)
        Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    UNITS
1 01/01/2022    AQS 60010007   3                           12.7 ug/m3 LC
2 01/02/2022    AQS 60010007   3                           13.9 ug/m3 LC
3 01/03/2022    AQS 60010007   3                            7.1 ug/m3 LC
4 01/04/2022    AQS 60010007   3                            3.7 ug/m3 LC
5 01/05/2022    AQS 60010007   3                            4.2 ug/m3 LC
6 01/06/2022    AQS 60010007   3                            3.8 ug/m3 LC
  DAILY_AQI_VALUE Site.Name DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
1              52 Livermore               1              100              88101
2              55 Livermore               1              100              88101
3              30 Livermore               1              100              88101
4              15 Livermore               1              100              88101
5              18 Livermore               1              100              88101
6              16 Livermore               1              100              88101
        AQS_PARAMETER_DESC CBSA_CODE                         CBSA_NAME
1 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
2 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
3 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
4 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
5 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
6 PM2.5 - Local Conditions     41860 San Francisco-Oakland-Hayward, CA
  STATE_CODE      STATE COUNTY_CODE  COUNTY SITE_LATITUDE SITE_LONGITUDE
1          6 California           1 Alameda      37.68753      -121.7842
2          6 California           1 Alameda      37.68753      -121.7842
3          6 California           1 Alameda      37.68753      -121.7842
4          6 California           1 Alameda      37.68753      -121.7842
5          6 California           1 Alameda      37.68753      -121.7842
6          6 California           1 Alameda      37.68753      -121.7842
tail(two)
            Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    UNITS
15971 12/10/2002    AQS 61131003   1                             15 ug/m3 LC
15972 12/13/2002    AQS 61131003   1                             15 ug/m3 LC
15973 12/22/2002    AQS 61131003   1                              1 ug/m3 LC
15974 12/25/2002    AQS 61131003   1                             23 ug/m3 LC
15975 12/28/2002    AQS 61131003   1                              5 ug/m3 LC
15976 12/31/2002    AQS 61131003   1                              6 ug/m3 LC
      DAILY_AQI_VALUE            Site.Name DAILY_OBS_COUNT PERCENT_COMPLETE
15971              57 Woodland-Gibson Road               1              100
15972              57 Woodland-Gibson Road               1              100
15973               4 Woodland-Gibson Road               1              100
15974              74 Woodland-Gibson Road               1              100
15975              21 Woodland-Gibson Road               1              100
15976              25 Woodland-Gibson Road               1              100
      AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
15971              88101 PM2.5 - Local Conditions     40900
15972              88101 PM2.5 - Local Conditions     40900
15973              88101 PM2.5 - Local Conditions     40900
15974              88101 PM2.5 - Local Conditions     40900
15975              88101 PM2.5 - Local Conditions     40900
15976              88101 PM2.5 - Local Conditions     40900
                                    CBSA_NAME STATE_CODE      STATE COUNTY_CODE
15971 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
15972 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
15973 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
15974 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
15975 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
15976 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
      COUNTY SITE_LATITUDE SITE_LONGITUDE
15971   Yolo      38.66121      -121.7327
15972   Yolo      38.66121      -121.7327
15973   Yolo      38.66121      -121.7327
15974   Yolo      38.66121      -121.7327
15975   Yolo      38.66121      -121.7327
15976   Yolo      38.66121      -121.7327
tail(ttwo)
            Date Source  Site.ID POC Daily.Mean.PM2.5.Concentration    UNITS
56135 12/01/2022    AQS 61131003   1                            3.4 ug/m3 LC
56136 12/07/2022    AQS 61131003   1                            3.8 ug/m3 LC
56137 12/13/2022    AQS 61131003   1                            6.0 ug/m3 LC
56138 12/19/2022    AQS 61131003   1                           34.8 ug/m3 LC
56139 12/25/2022    AQS 61131003   1                           23.2 ug/m3 LC
56140 12/31/2022    AQS 61131003   1                            1.0 ug/m3 LC
      DAILY_AQI_VALUE            Site.Name DAILY_OBS_COUNT PERCENT_COMPLETE
56135              14 Woodland-Gibson Road               1              100
56136              16 Woodland-Gibson Road               1              100
56137              25 Woodland-Gibson Road               1              100
56138              99 Woodland-Gibson Road               1              100
56139              74 Woodland-Gibson Road               1              100
56140               4 Woodland-Gibson Road               1              100
      AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
56135              88101 PM2.5 - Local Conditions     40900
56136              88101 PM2.5 - Local Conditions     40900
56137              88101 PM2.5 - Local Conditions     40900
56138              88101 PM2.5 - Local Conditions     40900
56139              88101 PM2.5 - Local Conditions     40900
56140              88101 PM2.5 - Local Conditions     40900
                                    CBSA_NAME STATE_CODE      STATE COUNTY_CODE
56135 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
56136 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
56137 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
56138 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
56139 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
56140 Sacramento--Roseville--Arden-Arcade, CA          6 California         113
      COUNTY SITE_LATITUDE SITE_LONGITUDE
56135   Yolo      38.66121      -121.7327
56136   Yolo      38.66121      -121.7327
56137   Yolo      38.66121      -121.7327
56138   Yolo      38.66121      -121.7327
56139   Yolo      38.66121      -121.7327
56140   Yolo      38.66121      -121.7327
str(two)
'data.frame':   15976 obs. of  20 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily.Mean.PM2.5.Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  78 92 71 80 98 115 87 57 65 107 ...
 $ Site.Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
str(ttwo)
'data.frame':   56140 obs. of  20 variables:
 $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Daily.Mean.PM2.5.Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  52 55 30 15 18 16 10 29 54 47 ...
 $ Site.Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...

Both data sets have 20 columns, but 2022 has a much larger number of rows. The rows appear to be ordered by site and, within each site, by date from January to December. The data frames contain character, integer, and numeric columns. No variable appears to consist entirely of missing values.

Combining Data

# Stack the 2002 and 2022 tables into a single data frame.
total <- rbind(two, ttwo)

# Date is an "MM/DD/YYYY" string, so its parts can be cut out by position.
total_byyear <- mutate(
  total,
  year = substr(Date, 7, 10),
  month = substr(Date, 1, 2),
  day = substr(Date, 4, 5)
)

# Keep only the columns used below, giving them short names in the same step.
newtot <- total_byyear |>
  select(
    Site = Site.ID,
    PM2.5 = Daily.Mean.PM2.5.Concentration,
    AQI = DAILY_AQI_VALUE,
    Name = Site.Name,
    CBSA = CBSA_NAME,
    County = COUNTY,
    Lat = SITE_LATITUDE,
    Lon = SITE_LONGITUDE,
    Yr = year,
    Mon = month,
    Day = day
  )

Extract the columns from the combined data set that will be used later. Rename those columns to create better names for indexing and using those columns.

Mapping with Leaflet

library(leaflet)

# One row per monitoring site and year. `newtot` holds one row per daily
# reading, so without de-duplication every site would be drawn once for each
# day it reported.
location <- unique(newtot[, c("Site", "Lat", "Lon", "Yr")])

# Two-colour palette keyed on the year label ("2002" / "2022").
yr.pal <- colorFactor(c("navy", "lightblue"), domain = location$Yr)

statmap <- leaflet(location) |>
  addProviderTiles("CartoDB.Positron") |>
  addCircles(
    lat = ~Lat,
    lng = ~Lon,
    # Site is an integer ID, so it needs no rounding before display.
    label = ~as.character(Site),
    color = ~yr.pal(Yr),
    opacity = 1, fillOpacity = 1, radius = 500
  ) |>
  addLegend(
    "bottomleft",
    pal = yr.pal, values = location$Yr,
    title = "Year", opacity = 1
  )
statmap

Since 2002, it appears that more sites have been added to the database. It also looks like the distribution of sites across California has improved since 2002. There seems to be a larger concentration of sites near major metropolitan areas such as Los Angeles and San Francisco.

Exploring PM2.5 Values

sum(is.na(newtot))
[1] 0
summary(newtot$PM2.5)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   -2.2     4.5     7.7    10.2    12.4   302.5 
summary(newtot$AQI)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   19.00   32.00   37.46   52.00  353.00 
# Per-year summary of the daily mean PM2.5 readings: mean, median, total, and
# number of readings in each year.
summary_data <- newtot |>
  group_by(Yr) |>
  summarize(
    Mean = mean(PM2.5),
    Median = median(PM2.5),
    Sum = sum(PM2.5),
    Count = n()
  )

# Print the table so the figures quoted in the text are visible in the output.
summary_data

There are no missing values in the combined data set. AQI values normally range from 0 to 500, so there does not appear to be any implausible data in that column. Typical PM2.5 readings range from under 25 to over 300, so the upper end of the data looks plausible; however, the minimum of -2.2 is negative, which is not a physically possible concentration, so the negative readings should be treated with caution. Based on the summary table, grouped by year, there appears to be some improvement in PM2.5 levels since 2002. In 2002, the mean was around 16, while it was about 8.5 in 2022. The median also fell from 12 to 6.9.

Visualizing Change

  # Side-by-side boxplots of the daily mean PM2.5 readings, one box per year.
  ggplot(newtot, aes(x = Yr, y = PM2.5, fill = Yr)) +
    geom_boxplot() +
    labs(
      title = "California PM 2.5 Concentrations: 2002 vs 2022",
      x = "Year",
      y = "PM 2.5 Daily Mean Concentration"
    )

While the mean PM2.5 levels across the state of CA decreased in the past 20 years, it appears that there is a greater range in the higher values of PM 2.5. While that average has decreased, the number of points at greater values has increased. This may be, in part, due to an increase in the number of sites from 2002 to 2022.

# Mean daily PM2.5 per county, computed separately from each year's raw table.
# The result columns are named directly in the pipeline.
county.data02 <- two |>
  group_by(COUNTY) |>
  summarize(pm2.5 = mean(Daily.Mean.PM2.5.Concentration)) |>
  rename(county = COUNTY)

county.data22 <- ttwo |>
  group_by(COUNTY) |>
  summarize(pm2.5 = mean(Daily.Mean.PM2.5.Concentration)) |>
  rename(county = COUNTY)

# Every daily reading as a jittered point, split by year and coloured by
# county.
ggplot(newtot) +
  geom_point(
    mapping = aes(x = Yr, y = PM2.5, color = County),
    position = "jitter"
  ) +
  labs(
    title = "PM2.5 Concentrations By County in 2002 and 2022",
    x = "Year",
    y = "PM2.5 Concentrations"
  )

When broken down by county, most PM2.5 levels in 2002 were clustered together between 0 and 100. In 2022 the main cluster moved closer to 0, but more points lie far from these lower levels. About two counties account for the majority of the data points that fall well outside the main cluster.

# Restrict the combined data to Los Angeles County readings.
newtotla <- newtot[newtot$County == "Los Angeles", ]

# PM2.5 is continuous, so bin it with a histogram; geom_bar() would treat each
# distinct concentration value as its own bar. Years are drawn side by side.
ggplot(newtotla, aes(x = PM2.5, fill = Yr)) +
  geom_histogram(binwidth = 2, position = "dodge") +
  labs(
    title = "Los Angeles County PM2.5 Concentrations: 2002 vs 2022",
    x = "PM2.5 Daily Mean Concentration",
    y = "Number of Daily Readings",
    fill = "Year"
  )

The data from Los Angeles County look similar to the overall pattern for California. The PM 2.5 levels at most sites in the county are concentrated at lower values. The spread of the data narrows considerably from 2002 to 2022. The height of the histogram for 2022 could also suggest that more samples are being recorded within the county. Overall, it appears that PM 2.5 concentrations are trending downward.